library("data.table")
library("haven")
library("arrow")
# Use the folder of the script currently open in RStudio as working directory,
# so that the relative "dads_out/" output paths used below resolve next to it.
# NOTE(review): relies on rstudioapi, so this presumably only works when the
# script is run from within RStudio - confirm before running it in batch mode.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# One data directory per year. psid_use() looks these up by name
# ("lib" followed by the year), so a new year needs a new `lib<year>` variable.
# Each path ends with a separator because file names are appended directly.
# Note that the 2021 path points to a "Format parquet" sub-folder whereas the
# 2022 and 2023 paths point to the year folder itself.
lib2021 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2021\\Format parquet\\"
lib2022 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2022\\"
lib2023 <- "\\\\casd.fr\\casdfs\\Projets\\INEPROG\\Data\\DADS_DADS Postes_2023\\"


#-----------------------------------------#
# Function ####
#-----------------------------------------#
    #' Build one yearly DADS extract with the pseudo-identifier attached.
    #'
    #' Reads every file of the year's data directory whose name matches
    #' `post_<y>`, keeps the rows with `s_brut > 0`, stacks them, left-joins the
    #' pseudo-identifier table `psid_<y>.sas7bdat` on `ident_s`, fills missing
    #' `ident_all` values, and writes the result to `dads_out/dads_<y>.<ext>`
    #' (relative to the working directory).
    #'
    #' @param y Year to process (e.g. `2022`). A variable named `lib<y>` holding
    #'   the data directory (with trailing separator) must be visible from the
    #'   function, typically in the global environment.
    #' @param list Character vector of extra column names to read, in addition to
    #'   `ident_s`, `siren`, `nic`, `s_brut` and `regt`, which are always read.
    #' @param out Output format: `"dta"` writes a Stata file (version 14); any
    #'   other value writes an RDS file.
    #' @return `NULL`, invisibly. The function is called for its side effect of
    #'   writing the output file.
    psid_use <- function(y, list, out = "rds") {
      # Resolve the year-specific data directory from the `lib<y>` variable and
      # fail with an explicit message if that year has not been configured.
      lib_name <- paste0("lib", y)
      if (!exists(lib_name)) {
        stop("No data directory configured for year ", y,
             " (expected a variable named '", lib_name, "').",
             call. = FALSE)
      }
      liby <- get(lib_name)
      print(liby)

      files_y <- list.files(path = liby)
      files_y <- files_y[grep(paste0("post_", y), files_y)]
      if (length(files_y) == 0L) {
        stop("No file matching 'post_", y, "' found in ", liby, call. = FALSE)
      }

      # Columns to load: the fixed set plus the caller-supplied ones. Passing the
      # names through all_of() avoids the ambiguous mix of bare column names and
      # an external character vector in the tidyselect specification.
      keep_cols <- c("ident_s", "siren", "nic", "s_brut", "regt", list)

      # Collect the filtered files in a pre-allocated list and bind them once,
      # instead of growing the table with rbind() at every iteration.
      chunks <- vector("list", length(files_y))
      for (i in seq_along(files_y)) {
        chunk <- read_parquet(paste0(liby, files_y[i]),
                              col_select = all_of(keep_cols))
        # read_parquet() returns a tibble by default; the filter below and the
        # `:=` further down need a data.table.
        setDT(chunk)
        chunk <- chunk[s_brut > 0, ]
        print(files_y[i])
        print(nrow(chunk))
        chunks[[i]] <- chunk
      }
      dads_y <- rbindlist(chunks, use.names = TRUE, fill = TRUE)
      rm(chunks, chunk)

      # Pseudo-identifier table for the year; column names are lower-cased so
      # that the join key matches `ident_s` in the DADS data.
      psid_y <- read_sas(paste0("C:\\Users\\Public\\Documents\\pseudo_id\\psid_",
                                y, ".sas7bdat"))
      colnames(psid_y) <- tolower(colnames(psid_y))

      # Left join: keep every DADS row, matched or not.
      dads_y <- merge(dads_y, psid_y, by = "ident_s", all.x = TRUE)
      rm(psid_y)

      # Rows without a usable pseudo-identifier (NA or empty string) get a
      # fallback built from ident_s and the two-digit year: ident_s * 100 + yy.
      # NOTE(review): the fallback is numeric while `ident_all` is compared to
      # "" above; the result type relies on ifelse() coercion - confirm the
      # column types of ident_s / ident_all in the source files.
      yy <- as.numeric(substr(y, 3, 4))
      dads_y[, ident_all := ifelse(is.na(ident_all) | ident_all == "",
                                   ident_s * 100 + yy,
                                   ident_all)]

      # Make sure the output folder exists so the write cannot fail on a
      # missing directory after all the loading work has been done.
      dir.create("dads_out", showWarnings = FALSE)
      if (out == "dta") {
        write_dta(dads_y, paste0("dads_out/dads_", y, ".dta"), version = 14)
      } else {
        saveRDS(dads_y, paste0("dads_out/dads_", y, ".rds"))
      }
      invisible(NULL)
    }
    

#-----------------------------------------#
# Use ####
#-----------------------------------------#

   
  # Build the 2022 extract: read the six extra columns listed below on top of
  # the columns psid_use() always loads, and save the result as an RDS file
  # under dads_out/.
  psid_use(2022,
           list=c("sexe","nbheur","duree","annee_naiss","apet","pcs"),
           out="rds")
  
